---
title:  "Grandfathering balance table"
author: "Jack Gregory"
date:   "22 June 2024"
output:
  bookdown::html_document2:
    toc:          yes
    toc_depth:    2
    toc_float:    yes
    fig_caption:  yes
    df_print:     paged
    highlight:    textmate
    keep_md:      true
  pdf_document:
    latex_engine: pdflatex
    highlight:    haddock
    keep_md:      true
---
___

```{r preamble, include=FALSE}

## Initiate 
## ... Packages
pkgs <- c(
  "knitr",                            # Reproducible reporting
  "here",                             # File system
  "sf"                                # Spatial data
)
install.packages(setdiff(pkgs, rownames(installed.packages())))
lapply(pkgs, library, character.only = TRUE)
rm(pkgs)

knitr::opts_chunk$set(
  echo = TRUE,
  message=FALSE, 
  warning=FALSE,
  fig.topcaption=TRUE, 
  fig.show="hold",
  fig.align="center",
  fig.width=7, 
  fig.height=4.6)

source(here("src/preamble.R"))

## ... Files
l.file <- list(
  gf_raw = here("data/use_data/regression_vars.csv"),
  gf_cln = here("data/use_data/regressions_ready_data.dta"),
  epa_eia_xwalk = here("data/epa/epa_eia_crosswalk.csv"),
  gf_cems_xwalk = here("data/gf_cems_xwalk.xlsx"),
  epa = here("data/epa/Facility_Attributes.zip"),
  state = here("data/eia/shp/USA_States_Generalized.shp"),
  fig2 = here("out/fig2.pdf"),
  tbl5 = here("out/tbl5.tex"),
  tbl6 = here("out/tbl6.tex")
)
```

```{r theme, include=FALSE}

theme_maps <- function() {
  theme_void() +
  theme(legend.position = "right",
        legend.text = element_text(size=7),
        plot.caption = element_text(hjust=0))
} 
```


# Objective

This workbook forms a part of the analysis related to the Grandfathering project.  It prepares a data balance table using cleaned grandfathering data, EPA-EIA crosswalk, and EPA facility attributes.

The remainder of this workbook is organized as follows.  First, we import the necessary data.  Next, we clean the data while also providing a running commentary for replication.  Finally, we prepare the balance table and export it as a tex file.


# Import

We import grandfathering, EPA-EIA crosswalk, EPA facility and various spatial data.


## Boiler characteristics

We import the grandfathering dataset, i.e., the synthesis of boiler characteristics previously generated. 

```{r data-gf}

## Import GF dataset
df.gf_cln <- read_dta(l.file$gf_cln) %>%
  filter(!(year<1985 | year==2006 | year>2017)) %>%
  filter(plant_code!=10673) %>%                        ## Unit is in HI
  select(year,
         states,
         ORISPL = plant_code,
         BOILERID = boiler_id,
         Gf,
         ut_type,
         manufact,
         so2_nonattain,
         applic_reg,
         age,
         inservice_y,
         capacity,
         efficiency_100_pct_load,
         ARP_subject,
         ARPprice,
         survive,
         DURATION,
         GLOAD,
         HEAT,
         SO2_MASS,
         SO2,
         sulfur_dist)

## Extract utility type definitions
utility_type <- attributes(df.gf_cln$ut_type)$labels
manufact_type <- attributes(df.gf_cln$manufact)$labels

## Convert GF dataset to cross-section
df.gf_cln <- df.gf_cln %>%
  group_by(ORISPL, BOILERID) %>%
  mutate(SO2_RATE = SO2_MASS / HEAT,
         HEATRATE = HEAT / GLOAD) %>%
  mutate_at(vars(GLOAD, SO2_RATE, HEATRATE), ~ifelse(HEATRATE>15, NA, .)) %>%
  fill(ut_type, manufact, .direction="downup") %>%
  summarise(STATE = first(states),
            GF = mean(Gf, na.rm=TRUE),
            UTILITY_TYPE = first(ut_type),
            MANUFACT_TYPE = first(manufact),
            NONATTAIN = mean(so2_nonattain, na.rm=TRUE),
            ARP = max(ARP_subject, na.rm=TRUE),
            LOCALREG = mean(applic_reg, na.rm=TRUE),
            CAPACITY = min(capacity, na.rm=TRUE),
            EFFICIENCY = min(efficiency_100_pct_load, na.rm=TRUE),
            AGE = mean(age, na.rm=TRUE),
            INSERVICE = mean(inservice_y, na.rm=TRUE),
            RETIREMENT = max(year, na.rm=TRUE),
            RETIRE = sum((survive==0), na.rm=TRUE),
            DURATION = mean(DURATION, na.rm=TRUE),
            GLOAD = mean(GLOAD, na.rm=TRUE),
            HEATRATE = mean(HEATRATE, na.rm=TRUE),
            SO2_RATE = mean(SO2_RATE, na.rm=TRUE),
            SULFUR_DIST = mean(sulfur_dist, na.rm=TRUE),
            .groups="drop") %>%
  mutate_all(~ifelse(. %in% c(NaN, -Inf, Inf), NA, .)) %>%
  mutate(RETIREMENT = ifelse(RETIREMENT==2017 & RETIRE!=1, NA, RETIREMENT)) %>%
  ungroup()
```


## Plant coordinates

There are multiple sources for plant geographic locations:

- Grandfathering dataset;
- EPA facility attributes; and,
- EIA860.

Here, we rely on the grandfathering and EPA datasets.  We first import the grandfathering data, where we summarize it at the ORISPL level.

```{r data-gf-coord}

## Import gf coordinate data
df.gf_raw <- read_csv(l.file$gf_raw) %>%
  select(ORISPL = plant_code, 
       LAT = plt_latitude, 
       LON = plt_longitude,
       COUNTY_LAT = cty_lon,
       COUNTY_LON = cty_lat) %>%
  filter(ORISPL!=10673)

## Determine distinct ORISPL and county coordinates
df.gf_county <- df.gf_raw %>%
  distinct(ORISPL, COUNTY_LAT, COUNTY_LON) %>%
  group_by(ORISPL) %>%
  summarise_all(first) %>%
  ungroup()

## Create plant coordinate dataframe
df.gf_raw <- df.gf_raw %>%
  select(-starts_with("COUNTY_")) %>%
  mutate(LON = ifelse(LON>0, NA, LON)) %>%
  filter(!(is.na(LAT) | is.na(LON))) %>%
  distinct() %>%
  group_by(ORISPL) %>%
  summarise_all(first) %>%
  ungroup() %>%
  full_join(df.gf_county, by=c("ORISPL")) %>%
  arrange(ORISPL)
```

We next import the EPA data and summarize at the ORISPL level.

```{r data-epa}

## Unzip EPA facility data
epa_file <- zip::zip_list(l.file$epa) %>%
  filter(str_detect(filename, "^facility")) %>%
  pull(filename)
zip::unzip(l.file$epa, file=epa_file, exdir=here("data/epa"))

## Import EPA coordinate data
df.epa <- read_csv(here("data/epa", epa_file)) %>%
  select(ORISPL = `Facility ID (ORISPL)`, 
       LAT_EPA = `Facility Latitude`, 
       LON_EPA = `Facility Longitude`) %>%
  distinct() %>%
  arrange(ORISPL)

## Remove EPA facility data
fs::file_delete(here("data/epa", epa_file))
```

Finally, we import the EPA-EIA crosswalk data and summarize at the ORISPL level.

```{r data-epa-eia-xwalk}

df.epa_eia_xwalk <- read_csv(l.file$epa_eia_xwalk) %>%
  select(ORISPL = CAMD_PLANT_ID,
         CAMD_LATITUDE,
         CAMD_LONGITUDE,
         EIA_LATITUDE,
         EIA_LONGITUDE) %>%
  group_by(ORISPL) %>%
  summarise_at(vars(starts_with("CAMD_"), starts_with("EIA_")), first) %>%
  ungroup()
```


## State shapefiles

The state spatial data is sourced from the [EIA](https://atlas.eia.gov/datasets/esri::usa-states-generalized).  While, only the shp file is named below, note that its import also requires equivalently named dbf, prj, and shx files in the folder.

```{r sf-state, fig.cap="State shapefile"}

## Import state shapefile
df.state <- read_sf(l.file$state) %>%
  filter(!(STATE_ABBR %in% c("AK","HI")))

## Display coordinate reference system
st_crs(df.state)

## Plot
ggplot(data=df.state) +
  geom_sf(color="grey40", fill="grey90", size=0.3) +
  coord_sf(crs=sf::st_crs(2163))
```


# Cleaning

This sections cleans plant coordinates and boiler characteristics.


## Plant coordinates

We first construct the plant latitude and longitude coordinates using the following concordance:

1. Grandfathering plant coordinates;
2. EIA coordinates from EPA-EIA crosswalk;
3. EPA facility attributes; 
4. EPA coordinates from EPA-EIA crosswalk; and,
5. Grandfathering county centroids.

The base values are from the grandfathering dataset, which are sourced from EIA-860.  For any missing values, we replace them with coordinates from EPA facility attributes.  Finally, for any remaining missing values, we adopt county centroids as calculated in the grandfathering dataset.

This subsection produces Figure 2 in the paper.

```{r plant-coords}

## Clean plant coordinates and convert to sf
df.plant <- df.gf_raw %>%
  left_join(df.epa, by=c("ORISPL")) %>%
  left_join(df.epa_eia_xwalk, by=c("ORISPL")) %>%
  mutate(LAT = case_when(!is.na(LAT) ~ LAT,
                         is.na(LAT) & !is.na(EIA_LATITUDE) ~ EIA_LATITUDE,
                         is.na(LAT) & is.na(EIA_LATITUDE) & !is.na(LAT_EPA) ~ LAT_EPA,
                         is.na(LAT) & is.na(EIA_LATITUDE) & is.na(LAT_EPA) & !is.na(EIA_LATITUDE) ~
                           EIA_LATITUDE,
                         TRUE ~ COUNTY_LAT),
         LON = case_when(!is.na(LON) ~ LON,
                         is.na(LON) & !is.na(EIA_LONGITUDE) ~ EIA_LONGITUDE,
                         is.na(LON) & is.na(EIA_LONGITUDE) & !is.na(LON_EPA) ~ LON_EPA,
                         is.na(LON) & is.na(EIA_LONGITUDE) & is.na(LON_EPA) & !is.na(EIA_LONGITUDE) ~
                           EIA_LONGITUDE,
                         TRUE ~ COUNTY_LON),
         LATITUDE = LAT,
         LONGITUDE = LON) %>%
  select(ORISPL, LAT, LON, LATITUDE, LONGITUDE) %>%
  right_join(df.gf_cln %>%
              group_by(ORISPL) %>%
              summarise(N = n(),
                        GF_MAX = max(GF, na.rm=TRUE),
                        GF_MEAN = mean(GF, na.rm=TRUE),
                        .groups="drop"),
            by=c("ORISPL")) %>%
  mutate(GF_MAX = factor(GF_MAX, levels=c(1,0), labels=c("GF","Non-GF")),
         GF_GRP = ifelse(GF_MEAN>0 & GF_MEAN<1, 2, GF_MEAN),
         GF_GRP = factor(GF_GRP, levels=c(1,2,0), labels=c("GF","Both","Non-GF"))) %>%
  st_as_sf(coords = c("LONGITUDE","LATITUDE"), crs=st_crs(df.state), agr="constant")

## Display coordinate reference system
st_crs(df.plant)

## Plots
ggplot() +
  geom_sf(data=df.state, fill="grey85", color="grey95", size=0.3) +
  geom_sf(data=df.plant, aes(color=GF_MAX, size=N)) +
  scale_color_viridis_d(begin=0.25, end=0.7, alpha=0.35) +
  scale_size(breaks=c(5,10)) +
  coord_sf(crs=sf::st_crs(2163)) +
  labs(#title="Coal-fired power plants",
       color="NSR",
       size="Boilers") +
  theme_maps() +
  guides(color = guide_legend(order=1, override.aes=list(size=3.5, alpha=0.5)),
         size = guide_legend(order=2, override.aes=list(shape=1, alpha=0.5)))

ggplot() +
  geom_sf(data=df.state, fill="grey85", color="grey95", size=0.3) +
  geom_sf(data=df.plant, aes(color=GF_GRP, size=N)) +
  scale_color_viridis_d(begin=0.1, end=0.85, alpha=0.35) +
  scale_size(breaks=c(5,10)) +
  coord_sf(crs=sf::st_crs(2163)) +
  labs(#title="Coal-fired power plants",
       color="NSR",
       size="Boilers") +
  theme_maps() +
  guides(color = guide_legend(order=1, override.aes=list(size=3.5, alpha=0.5)),
         size = guide_legend(order=2, override.aes=list(shape=1, alpha=0.5)))

ggplot() +
  geom_sf(data=df.state, fill="grey99", color="grey70", size=0.3) +
  geom_sf(data=df.plant, aes(color=GF_MEAN, size=N)) +
  scale_color_viridis_c(direction=-1, begin=0.1, end=0.8, alpha=0.4) +
  scale_size(breaks=c(5,10)) +
  coord_sf(crs=sf::st_crs(2163)) +
  labs(#title="Coal-fired power plants",
       color="Share NSR\ngrandfathered",
       size="Boilers") +
  theme_maps() +
  guides(color = guide_legend(order=1, override.aes=list(size=3.5, alpha=0.5)),
         size = guide_legend(order=2, override.aes=list(shape=1, alpha=0.5)))

ggsave(l.file$fig2, width=8, height=4.25, units="in")
```


## Boiler characteristics

Next, we join plant coordinates with the grandfathering dataset.

```{r boiler-chars}

df.boiler <- df.gf_cln %>%
  left_join(df.plant %>%
              as_tibble() %>% 
              select(-N, -starts_with("GF"), -geometry), 
            by=c("ORISPL"))
```


# Analysis

This section prepares balance tables for the paper.


## Grandfathering

This subsection produces Table 5 in the paper.

```{r analysis}

## NB - See <http://www.stat.yale.edu/Courses/1997-98/101/meancomp.htm>


levels <- c("NONATTAIN","ARP","LOCALREG","LAT","LON","AGE","INSERVICE","RETIREMENT","RETIRE","CAPACITY",
            "EFFICIENCY","DURATION","GLOAD","HEATRATE","SO2_RATE","SULFUR_DIST","UT_CI","UT_IOU","UT_OTH",
            "MT_CE","MT_FW","MT_RS","MT_OTH","N")
labels <- c("Nonattainment",
            "Acid Rain Program",
            "Applicable Non-NSR Emission Standard [lbs/MMBtu]",
            "Latitude",
            "Longitude",
            "Age",
            "Inservice year",
            "Retirement year",
            "Retire",
            "Capacity",
            "Efficiency",
            "Duration$^{\\dagger}$ [hr/yr]",
            "Generation$^{\\dagger}$ [GWh/yr]",
            "Heat rate [mmBTu/MW]",
            "SO2 emissions$^{\\dagger}$ [lbs/MMBtu]",
            "Weighted average mine sulfur content [\\% weight]",
            "Commercial \\& industrial utility share",
            "Investor-owned utility share",
            "Other utility share",
            "CE manufacturing share",
            "FW manufacturing share",
            "RS manufacturing share",
            "Other manufacturing share",
            "N")

df.bal_n <- df.boiler %>%
  filter(INSERVICE>=1950 & INSERVICE<=2004) %>%
  group_by(GF) %>%
  summarise(N = n(), .groups="drop") %>%
  mutate(N = formatC(N, format="f", digits=0, big.mark=",")) %>%
  pivot_wider(names_from=GF, names_prefix="GF_", values_from="N") %>%
  mutate(VAR = "N",
         ORDER = 1) %>%
  select(VAR, ORDER, GF_1, GF_0)

df.bal <- df.boiler %>%
  select(-HEATRATE) %>%
  mutate(GLOAD = GLOAD/10^3,
         UT_CI = (UTILITY_TYPE %in% c(2,4)),
         UT_IOU = (UTILITY_TYPE==5),
         UT_OTH = (!(UTILITY_TYPE %in% c(2,4,5))),
         MT_CE = (MANUFACT_TYPE==4),
         MT_FW = (MANUFACT_TYPE==7),
         MT_RS = (MANUFACT_TYPE==17),
         MT_OTH = (!(MANUFACT_TYPE %in% c(4,7,17)))) %>%
  select(-ORISPL, -BOILERID, -STATE, -UTILITY_TYPE, -MANUFACT_TYPE) %>%
  pivot_longer(cols=-GF, names_to="VAR", values_to="VAL") %>%
  group_by(VAR, GF) %>%
  summarise(MEAN = mean(VAL, na.rm=TRUE),
            SD = sd(VAL, na.rm=TRUE),
            N = sum(!is.na(VAL), na.rm=TRUE),
            .groups="drop") %>%
  pivot_wider(id_cols=VAR, names_from=GF, values_from=c(MEAN, SD, N)) %>%
  mutate(DIFF = MEAN_1 - MEAN_0,
         TSTAT = (MEAN_1 - MEAN_0)/sqrt(SD_0^2/N_0 + SD_1^2/N_1),
         SIG = case_when(abs(TSTAT)>=3.291 ~ "^{***}",
                         abs(TSTAT)>=2.576 ~ "^{**}",
                         abs(TSTAT)>=1.960 ~ "^{*}",
                         TRUE ~ "")) %>%
  mutate_at(vars(matches("MEAN|SD|DIFF|TSTAT")), ~formatC(., format="f", digits=2, big.mark=",")) %>%
  mutate_at(vars(starts_with("SD")), ~glue("({.})")) %>%
  mutate(DIFF = glue("{DIFF}{SIG}"),
         TSTAT = glue("[{TSTAT}]")) %>%
  select(-starts_with("N_"), -SIG) %>%
  pivot_longer(cols=matches("MEAN|SD|DIFF|TSTAT"), names_to="STAT", values_to="VAL") %>%
  separate(STAT, into=c("STAT","GF"), sep="_") %>%
  mutate(ORDER = case_when(STAT %in% c("MEAN","DIFF") ~ 1,
                           TRUE ~ 2)) %>%
  pivot_wider(id_cols=c(VAR, ORDER), names_from=GF, names_prefix="GF_", values_from=VAL) %>%
  rename(DIFF = GF_NA) %>%
  bind_rows(df.bal_n) %>%
  mutate(VAR = factor(VAR, levels=levels, labels=labels)) %>%
  arrange(VAR, ORDER) %>%
  mutate(VAR = as.character(VAR),
         VAR = case_when(VAR=="Applicable Non-NSR Emission Standard [lbs/MMBtu]" & ORDER==1 ~ 
                           "Applicable Non-NSR Emission",
                         VAR=="Applicable Non-NSR Emission Standard [lbs/MMBtu]" & ORDER==2 ~ 
                           " Standard [lbs/MMBtu]",
                         VAR=="Weighted average mine sulfur content [\\% weight]" & ORDER==1 ~ 
                           "Weighted average mine sulfur",
                         VAR=="Weighted average mine sulfur content [\\% weight]" & ORDER==2 ~ 
                           " content [\\% weight]",
                         VAR=="Commercial \\& industrial utility share" & ORDER==1 ~ 
                           "Commercial \\& industrial utility",
                         VAR=="Commercial \\& industrial utility share" & ORDER==2 ~ 
                           " share",
                         ORDER==2 ~ as.character(NA),
                         TRUE ~ VAR)) %>%
  mutate_all(~ifelse(is.na(.), "", .)) %>%
  select(ORDER, VAR, GF_1, GF_0, DIFF)
```

```{r balance-tbl}

if (knitr::is_html_output()) {
  df.bal %>%
    kable(format="html", align="lrrr", booktabs=TRUE, linesep=c(""),
          caption="Average characteristics of boilers by NSR grandfathering status",
          col.names=c("","Grandfathered","Non-Granfathered","Difference")) 
} else {
  df.bal
}
```

This section exports the table to a tex file.

```{r export}

## Build regression table (longtable) -----------------------------------------

## Construct table coefficients
tbl_coef <- df.bal %>%
  filter(VAR!="N") %>%
  mutate_at(vars(last_col()), ~case_when(ORDER==1 ~ paste0(., " \\\\"),
                                         TRUE ~ paste0(., " \\\\ [1.0ex]"))) %>%
  mutate_at(vars(2:last_col(offset=1)), ~paste0(., " &")) %>%
  select(-ORDER)

## Construct table summary
tbl_summary <- df.bal %>%
  filter(VAR=="N") %>%
  mutate(VAR = "Number of Boilers") %>%
  mutate_at(vars(2:last_col(offset=1)), ~paste0(., " &")) %>%
  mutate_at(vars(last_col()), ~paste0(., " \\\\")) %>%
  select(-ORDER)

## Prepare file header and table header, midder and footer
f_header <- c("%% Grandfathering Project",
              "%% Balance table",
              glue::glue("%% Compiled on {base::Sys.time()} using <balance.Rmd>"),
              "%% Written by Jack Gregory",
              "\n")
tbl_header <- c("\\begin{center}",
                "\\begin{singlespace}",
                "\\begin{footnotesize}\n",
                "\\begin{longtable}[c]{@{\\extracolsep{0.7ex}}l*{3}{D{.}{.}{-2}}@{}}",
                "\t\\caption{Average characteristics of boilers by NSR grandfathering status}",
                "\t\\label{tbl:balance}\n",
                "\\\\",
                "\\hline\\hline",
                paste0("\\textbf{Variable} & \\multicolumn{1}{c}{\\textbf{Grandfathered}} & ",
                       "\\multicolumn{1}{c}{\\textbf{Non-Grandfathered}} & ",
                       "\\multicolumn{1}{c}{\\textbf{Difference}} \\\\ [1.0ex]"),
                "\\midrule",
                "\\endfirsthead",
                "\\hline\\hline",
                paste0("\\textbf{Variable} & \\multicolumn{1}{c}{\\textbf{Grandfathered}} & ",
                       "\\multicolumn{1}{c}{\\textbf{Non-Grandfathered}} & ",
                       "\\multicolumn{1}{c}{\\textbf{Difference}} \\\\ [1.0ex]"),
                "\\midrule",
                "\\endhead")
tbl_midder <- c("\\midrule")
tbl_footer <- c("\\hline\\hline",
                paste0("\\multicolumn{4}{p{14.1cm}}{\\textit{Notes}:  This table displays average ",
                       "characteristics by NSR grandfathering status.  Standard deviations are in ",
                       "parentheses, with \\textit{t}-statistics of the difference between ",
                       "`grandfathered' and `non-grandfathered' boilers in brackets where ",
                       "*** p<0.001;  ** p<0.01;  * p<0.05.  All variables utilize boilers with inservice years ",
                       "between 1950 and 2004 inclusive, except for those from CEMS which are from 1997 onwards and ",
                       "are identified with a $^{\\dagger}$. Each row is a separate calculation, and ",
                       "is not conditional on the other variables reported here.} \\\\"),
                "\\end{longtable}\n",
                "\\end{footnotesize}",
                "\\end{singlespace}",
                "\\end{center}\n")

## Output TeX table file
f_header %>% write_lines(l.file$tbl5, append=FALSE)
tbl_header %>% write_lines(l.file$tbl5, append=TRUE)
tbl_coef %>% write_tsv(l.file$tbl5, 
                       append=TRUE, col_names=FALSE, escape="none")
tbl_midder %>% write_lines(l.file$tbl5, append=TRUE)
tbl_summary %>% write_tsv(l.file$tbl5, 
                          append=TRUE, col_names=FALSE, escape="none")
tbl_footer %>% write_lines(l.file$tbl5, append=TRUE)
```


## Local reg > 1.1

This subsection produces Table 6 in the paper.

```{r}

df.boiler11 <- df.boiler |>
  mutate(LR = ifelse(LOCALREG<1.1, 1, 0))

df.bal_n <- df.boiler11 %>%
  group_by(LR) %>%
  summarise(N = n(), .groups="drop") %>%
  mutate(N = formatC(N, format="f", digits=0, big.mark=",")) %>%
  pivot_wider(names_from=LR, names_prefix="LR_", values_from="N") %>%
  mutate(VAR = "N",
         ORDER = 1) %>%
  select(VAR, ORDER, LR_1, LR_0)

df.bal <- df.boiler11 %>%
  select(-GF, -HEATRATE) %>%
  mutate(GLOAD = GLOAD/10^3,
         UT_CI = (UTILITY_TYPE %in% c(2,4)),
         UT_IOU = (UTILITY_TYPE==5),
         UT_OTH = (!(UTILITY_TYPE %in% c(2,4,5))),
         MT_CE = (MANUFACT_TYPE==4),
         MT_FW = (MANUFACT_TYPE==7),
         MT_RS = (MANUFACT_TYPE==17),
         MT_OTH = (!(MANUFACT_TYPE %in% c(4,7,17)))) %>%
  select(-ORISPL, -BOILERID, -STATE, -UTILITY_TYPE, -MANUFACT_TYPE) %>%
  pivot_longer(cols=-LR, names_to="VAR", values_to="VAL") %>%
  group_by(VAR, LR) %>%
  summarise(MEAN = mean(VAL, na.rm=TRUE),
            SD = sd(VAL, na.rm=TRUE),
            N = sum(!is.na(VAL), na.rm=TRUE),
            .groups="drop") %>%
  pivot_wider(id_cols=VAR, names_from=LR, values_from=c(MEAN, SD, N)) %>%
  mutate(DIFF = MEAN_1 - MEAN_0,
         TSTAT = (MEAN_1 - MEAN_0)/sqrt(SD_0^2/N_0 + SD_1^2/N_1),
         SIG = case_when(abs(TSTAT)>=3.291 ~ "^{***}",
                         abs(TSTAT)>=2.576 ~ "^{**}",
                         abs(TSTAT)>=1.960 ~ "^{*}",
                         TRUE ~ "")) %>%
  mutate_at(vars(matches("MEAN|SD|DIFF|TSTAT")), ~formatC(., format="f", digits=2, big.mark=",")) %>%
  mutate_at(vars(starts_with("SD")), ~glue("({.})")) %>%
  mutate(DIFF = glue("{DIFF}{SIG}"),
         TSTAT = glue("[{TSTAT}]")) %>%
  select(-starts_with("N_"), -SIG) %>%
  pivot_longer(cols=matches("MEAN|SD|DIFF|TSTAT"), names_to="STAT", values_to="VAL") %>%
  separate(STAT, into=c("STAT","LR"), sep="_") %>%
  mutate(ORDER = case_when(STAT %in% c("MEAN","DIFF") ~ 1,
                           TRUE ~ 2)) %>%
  pivot_wider(id_cols=c(VAR, ORDER), names_from=LR, names_prefix="LR_", values_from=VAL) %>%
  rename(DIFF = LR_NA) %>%
  bind_rows(df.bal_n) %>%
  mutate(VAR = factor(VAR, levels=levels, labels=labels)) %>%
  arrange(VAR, ORDER) %>%
  mutate(VAR = as.character(VAR),
         VAR = case_when(VAR=="Applicable Non-NSR Emission Standard [lbs/MMBtu]" & ORDER==1 ~ 
                           "Applicable Non-NSR Emission",
                         VAR=="Applicable Non-NSR Emission Standard [lbs/MMBtu]" & ORDER==2 ~ 
                           " Standard [lbs/MMBtu]",
                         VAR=="Weighted average mine sulfur content [\\% weight]" & ORDER==1 ~ 
                           "Weighted average mine sulfur",
                         VAR=="Weighted average mine sulfur content [\\% weight]" & ORDER==2 ~ 
                           " content [\\% weight]",
                         VAR=="Commercial \\& industrial utility share" & ORDER==1 ~ 
                           "Commercial \\& industrial utility",
                         VAR=="Commercial \\& industrial utility share" & ORDER==2 ~ 
                           " share",
                         ORDER==2 ~ as.character(NA),
                         TRUE ~ VAR)) %>%
  mutate_all(~ifelse(is.na(.), "", .)) %>%
  select(ORDER, VAR, LR_1, LR_0, DIFF)
```

```{r balance-tbl}

if (knitr::is_html_output()) {
  df.bal %>%
    kable(format="html", align="lrrr", booktabs=TRUE, linesep=c(""),
          caption="Average characteristics of boilers by local regulation",
          col.names=c("","Local regulation > 1.1","Local regulation <= 1.1.","Difference")) 
} else {
  df.bal
}
```

This section exports the table to a tex file.

```{r export}

## Build regression table (longtable) -----------------------------------------

## Construct table coefficients
tbl_coef <- df.bal %>%
  filter(VAR!="N") %>%
  mutate_at(vars(last_col()), ~case_when(ORDER==1 ~ paste0(., " \\\\"),
                                         TRUE ~ paste0(., " \\\\ [1.0ex]"))) %>%
  mutate_at(vars(2:last_col(offset=1)), ~paste0(., " &")) %>%
  select(-ORDER)

## Construct table summary
tbl_summary <- df.bal %>%
  filter(VAR=="N") %>%
  mutate(VAR = "Number of Boilers") %>%
  mutate_at(vars(2:last_col(offset=1)), ~paste0(., " &")) %>%
  mutate_at(vars(last_col()), ~paste0(., " \\\\")) %>%
  select(-ORDER)

## Prepare file header and table header, midder and footer
f_header <- c("%% Grandfathering Project",
              "%% Balance table",
              glue::glue("%% Compiled on {base::Sys.time()} using <balance.Rmd>"),
              "%% Written by Jack Gregory",
              "\n")
tbl_header <- c("\\begin{center}",
                "\\begin{singlespace}",
                "\\begin{footnotesize}\n",
                "\\begin{longtable}[c]{@{\\extracolsep{0.7ex}}l*{3}{D{.}{.}{-2}}@{}}",
                "\t\\caption{Average characteristics of boilers by local regulation}",
                "\t\\label{tbl:balance_lr11}\n",
                "\\\\",
                "\\hline\\hline",
                paste0("\\textbf{Variable} & \\multicolumn{1}{c}{\\textbf{\\makecell{Local regulation\\\\> 1.1}}} & ",
                       "\\multicolumn{1}{c}{\\textbf{\\makecell{Local regulation\\\\$\\leq$ 1.1}}} & ",
                       "\\multicolumn{1}{c}{\\textbf{Difference}} \\\\ [1.0ex]"),
                "\\midrule",
                "\\endfirsthead",
                "\\hline\\hline",
                paste0("\\textbf{Variable} & \\multicolumn{1}{c}{\\textbf{\\makecell{Local regulation\\\\> 1.1}}} & ",
                       "\\multicolumn{1}{c}{\\textbf{\\makecell{Local regulation\\\\$\\leq$ 1.1}}} & ",
                       "\\multicolumn{1}{c}{\\textbf{Difference}} \\\\ [1.0ex]"),
                "\\midrule",
                "\\endhead")
tbl_midder <- c("\\midrule")
tbl_footer <- c("\\hline\\hline",
                paste0("\\multicolumn{4}{p{14.1cm}}{\\textit{Notes}:  This table displays average ",
                       "characteristics by local regulation at a cutoff of 1.1 lbs/MMBtu.  Standard deviations are in ",
                       "parentheses, with \\textit{t}-statistics of the difference between ",
                       "`grandfathered' and `non-grandfathered' boilers in brackets where ",
                       "*** p<0.001;  ** p<0.01;  * p<0.05.  All variables utilize the full extent ",
                       "of our dataset, except for those from CEMS which are from 1997 onwards and ",
                       "are identified with a $^{\\dagger}$. Each row is a separate calculation, and ",
                       "is not conditional on the other variables reported here.} \\\\"),
                "\\end{longtable}\n",
                "\\end{footnotesize}",
                "\\end{singlespace}",
                "\\end{center}\n")

## Output TeX table file
f_header %>% write_lines(l.file$tbl6, append=FALSE)
tbl_header %>% write_lines(l.file$tbl6, append=TRUE)
tbl_coef %>% write_tsv(l.file$tbl6, 
                       append=TRUE, col_names=FALSE, escape="none")
tbl_midder %>% write_lines(l.file$tbl6, append=TRUE)
tbl_summary %>% write_tsv(l.file$tbl6, 
                          append=TRUE, col_names=FALSE, escape="none")
tbl_footer %>% write_lines(l.file$tbl6, append=TRUE)
```

